Statement Of Contribution:

Assignment 1: Dinesh Sundaramoorthy (dinsu875)
Assignment 2: Jin Yan (jinya425)

library(ggplot2)
library(plotly)
## 
## 载入程辑包:'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(MASS)
## 
## 载入程辑包:'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
library(gridExtra)

Assignment One

# Load the olive oil data set (comma-separated, first row holds the column names).
data <- read.csv("olive.csv")

1.1

# Palmitic against oleic; the raw linolenic value is mapped to point colour.
scatter_plot1 <- ggplot(
  data = data,
  mapping = aes(x = oleic, y = palmitic, color = linolenic)
) +
  geom_point() +
  labs(title = "Palmitic vs Oleic") +
  theme_minimal()

scatter_plot1

# Split linolenic into four intervals with cut_interval() and store the result
# as a new column of the data frame, so it can be used as a discrete variable.
data$linolenic_new <- cut_interval(data$linolenic, n = 4)

# Same scatter plot as before, now coloured by the binned linolenic class.
scatter_plot2 <- ggplot(
  data = data,
  mapping = aes(x = oleic, y = palmitic, color = linolenic_new)
) +
  geom_point() +
  labs(title = "Palmitic on Oleic colored by linolenic_new") +
  theme_minimal()

scatter_plot2

Answer: The second picture is more convenient for analysis. Compared with saturation, the human eye is more sensitive to changes in hue. We can perceive more levels of hue than of saturation.

1.2

# The class variable is mapped inside geom_point() rather than in ggplot(), so
# that the two following plots only need to swap the aesthetic used here.
scatter_plot3_col <- ggplot(data = data, mapping = aes(oleic, palmitic)) +
  geom_point(mapping = aes(color = linolenic_new)) +
  labs(title = "Palmitic on Oleic colored by linolenic_new") +
  theme_minimal()

scatter_plot3_col

# Same scatter plot, with the binned linolenic class mapped to point size.
# ggplot2 warns here because size is being used for a discrete variable.
scatter_plot4_size <- ggplot(data = data, mapping = aes(oleic, palmitic)) +
  geom_point(mapping = aes(size = linolenic_new)) +
  labs(title = "Oleic vs Palmitic sized by linolenic_new") +
  theme_minimal()

scatter_plot4_size
## Warning: Using size for a discrete variable is not advised.

# Same scatter plot, with the binned linolenic class encoded as the orientation
# of a line segment (spoke) drawn from each point. The factor is converted to
# its integer level code, which is used as the spoke angle.
spoke_mapping <- aes(angle = as.numeric(linolenic_new), radius = 40)

scatter_plot5_angle <- ggplot(data = data, mapping = aes(oleic, palmitic)) +
  geom_point() +
  geom_spoke(mapping = spoke_mapping) +
  labs(title = "Palmitic vs Oleic angle by linolenic_new") +
  theme_minimal()

scatter_plot5_angle

Answer: It is most difficult to differentiate the categories in the second plot (i.e. by the size of the points), and it is easiest to differentiate them by color. In terms of perception metrics, hue has a channel capacity of 3.1 bits, while line orientation has 3 bits and the size of squares has only 2.2 bits, which matches this intuition.

1.3

# Oleic against eicosenoic, coloured by Region exactly as it is stored in the
# data frame (no conversion to a factor).
scatter_plot6 <- ggplot(
  data = data,
  mapping = aes(x = eicosenoic, y = oleic, color = Region)
) +
  geom_point() +
  labs(title = " Oleic vs Eicosenoic colored by Region") +
  theme_minimal()

scatter_plot6

# Same plot, but Region is converted to a factor so that each region is
# treated as a separate category when mapped to colour.
scatter_plot7 <- ggplot(
  data = data,
  mapping = aes(x = eicosenoic, y = oleic, color = as.factor(Region))
) +
  geom_point() +
  labs(title = "Oleic vs Eicosenoic colored by factored Region") +
  theme_minimal()

scatter_plot7

Answer: In the first picture, it is easy to distinguish the region of each sample point, but in the second picture you can distinguish them faster. The preattentive mechanism makes this possible: a unique visual property of the target, such as hue, makes it easy to notice. Value and saturation are not preattentive features, so perception based on them is slower.

1.4

# Bin three of the acids into three intervals each; the resulting factors are
# stored as new columns of the data frame.
data$linoleic_new    <- cut_interval(data$linoleic, n = 3)
data$palmitic_new    <- cut_interval(data$palmitic, n = 3)
data$palmitoleic_new <- cut_interval(data$palmitoleic, n = 3)

# Encode the three binned variables simultaneously as colour, size and shape,
# which gives 3 x 3 x 3 = 27 possible visual combinations.
class_mapping <- aes(
  color = linoleic_new,
  size = palmitoleic_new,
  shape = palmitic_new
)

scatter_plot8 <- ggplot(data = data, mapping = aes(x = eicosenoic, y = oleic)) +
  geom_point(mapping = class_mapping) +
  labs(title = "Oleic vs Eicosenoic") +
  theme_minimal()

scatter_plot8
## Warning: Using size for a discrete variable is not advised.

Answer: It is difficult to differentiate the 27 types of observations, because you need to compare three features for each point. A conjunction of features requires a serial search between feature maps, so it cannot be classified preattentively.

1.5

# Eicosenoic against oleic, with Region as colour and the binned palmitoleic
# and palmitic classes as size and shape.
region_mapping <- aes(
  color = Region,
  size = palmitoleic_new,
  shape = palmitic_new
)

scatter_plot9 <- ggplot(data = data, mapping = aes(x = oleic, y = eicosenoic)) +
  geom_point(mapping = region_mapping)

scatter_plot9
## Warning: Using size for a discrete variable is not advised.

Answer: This is because the separation of hues is very obvious, and humans perceive hue in the preattentive stage, which requires little effort, happens almost without our being aware of it, and can only detect independent features.

1.6

# Pie chart of the share of observations per Area.
# This differs slightly from the course template but produces the same kind of
# chart. `textinfo = "text"` is used while no `text` is supplied -- presumably
# this is what leaves the slices unlabelled; verify against the rendered plot.
# The axis settings hide grid lines, zero lines and tick labels; the attribute
# is spelled `showticklabels` (the original `showticklables` is not a plotly
# attribute, so that setting never took effect).
plot_ly(
  data = data,
  labels = ~Area,
  type = "pie",
  showlegend = TRUE,
  textinfo = "text"
) %>%
  layout(
    title = "Proportion of Oils from different regions",
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

Answer: In the absence of labels on the individual sections, it can be challenging to discern variations in proportions among the different areas. This highlights the issue of relative judgment.

1.7

# 2-D density contour plot of linoleic against eicosenoic.
scatter_plot10 <- ggplot(
  data = data,
  mapping = aes(x = eicosenoic, y = linoleic)
) +
  geom_density_2d() +
  labs(title = " CONTOUR PLOT - linoleic vs eicosenoic") +
  theme_minimal()

scatter_plot10

# Plain scatter plot of the same two variables, for comparison with the
# contour plot.
scatter_plot11 <- ggplot(
  data = data,
  mapping = aes(x = eicosenoic, y = linoleic)
) +
  geom_point() +
  labs(title = "SCATTER PLOTlinoleic vs Eicosenoic ") +
  theme_minimal()

scatter_plot11

Answer: The 2D density contour plot draws contour lines in areas even though there are no sample points there, which can be misleading.

Assignment Two

2.1

# Load the baseball workbook.
my_data <- readxl::read_xlsx("baseball-2016.xlsx")

Answer: It is reasonable to scale the data, since the scales for different features differ a lot. For example, the mean for “AB” is 5518.7. But the mean for “OBP” is 0.3213667.

2.2

# Take columns 3 to 28 of the data, standardise them with scale(), compute the
# pairwise distances between rows and run isoMDS() with two output dimensions.
data_2 <- scale(my_data[, 3:28])
d <- dist(data_2)
res <- isoMDS(d, k = 2)
## initial  value 19.856833 
## iter   5 value 16.319153
## iter  10 value 16.046215
## final  value 15.935476 
## converged
# Put the MDS coordinates into a data frame and attach the league and team of
# each row so they can be used for colouring and hover labels.
coords <- res$points
coordsMDS <- as.data.frame(coords)
coordsMDS[["League"]] <- my_data[["League"]]
coordsMDS[["Team"]] <- my_data[["Team"]]

# Interactive scatter plot of the two MDS coordinates, coloured by league;
# hovering over a point shows the team name.
plot <- plot_ly(
  data = coordsMDS,
  x = ~V1,
  y = ~V2,
  type = "scatter",
  mode = "markers",
  color = ~League,
  colors = c("blue", "green"),
  hovertext = ~Team
)
plot

Answer: There is a difference between the two leagues. From the graph, we can see that the V2 component provides the best differentiation, because most of the AL teams are located in that area. I think the Boston Red Sox, Chicago Cubs, NY Mets and San Diego Padres are outliers.

2.3

# Shepard plot: original dissimilarities against distances in the MDS map.
sh <- Shepard(d, coords)
delta <- as.numeric(d)         # distances between the original observations
D <- as.numeric(dist(coords))  # distances between the points in the MDS map

# For every entry of the lower triangle, record which two observations it
# belongs to. A matrix filled column-wise with 1..n holds the row number in
# each cell; filled row-wise it holds the column number. Taking the lower
# triangle of each yields the (row, column) pair of every entry, in the same
# column-by-column order in which a dist object stores its values.
n <- nrow(coords)
index <- matrix(seq_len(n), nrow = n, ncol = n)
index1 <- as.numeric(index[lower.tri(index)])

index <- matrix(seq_len(n), nrow = n, ncol = n, byrow = TRUE)
index2 <- as.numeric(index[lower.tri(index)])

# Team names are used to label each pair of observations on hover; the rest
# follows the course template.
Teams <- my_data$Team

plot_ly() %>%
  add_markers(
    x = ~delta, y = ~D, hoverinfo = "text",
    text = ~paste("Obj1: ", Teams[index1],
                  "<br> Obj 2: ", Teams[index2])
  ) %>%
  # fitted line from Shepard(), relevant when non-metric MDS is involved
  add_lines(x = ~sh$x, y = ~sh$yf)

Answer: I think point pairs such as (Minnesota Twins, Arizona Diamondbacks) and (Oakland Athletics, Milwaukee Brewers) are not differentiated very well.

2.4

# Build one scatter plot per scaled variable: the variable (y) against the
# second MDS coordinate V2 (x), coloured by league. The plots are stored in a
# list keyed by variable name and then drawn together in a single grid.
# The loop runs over however many columns data_2 has instead of a hard-coded
# count, so it stays correct if the column selection changes.
plots <- list()
for (i in seq_len(ncol(data_2))) {
  df <- data.frame(x = coordsMDS$V2, y = data_2[, i], League = my_data$League)
  plot <- ggplot(df, aes(x = x, y = y, color = League)) +
    geom_point() +
    labs(x = "V2", y = colnames(data_2)[i])

  plots[[colnames(data_2)[i]]] <- plot
}


combined_plots <- grid.arrange(grobs = plots)

# I think the scatterplots with variables HR or StolenB can show strongest connections.

# Show only the two selected scatter plots (columns 10 and 12 of data_2),
# each plotted against the second MDS coordinate V2 and coloured by league.
plots2 <- list()
for (i in c(10, 12)) {
  var_name <- colnames(data_2)[i]
  df <- data.frame(x = coordsMDS$V2, y = data_2[, i], League = my_data$League)
  plot <- ggplot(df, aes(x = x, y = y, color = League)) +
    geom_point() +
    labs(x = "V2", y = var_name)

  plots2[[var_name]] <- plot
}


combined_plots <- grid.arrange(grobs = plots2)

Answer: The two leagues focus on different aspects: the AL focuses on power, while the NL is more oriented towards pitching. So, the two variables are important for scoring in this game.